In [1]:
import pandas as pd

# Define the file path
file_path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0272EN-SkillsNetwork/labs/dataset/2016.csv"

try:
    # Read the CSV file into a pandas DataFrame
    # The first row is automatically used as headers by default
    df = pd.read_csv(file_path)
    
    # Print the first 5 rows to verify correct loading
    print("First 5 rows of the dataset:")
    print(df.head())
    
    # Optional: Print basic information about the dataset
    print(f"\nDataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    
    # DATA PREPARATION - Check and correct data types
    print("\n" + "="*50)
    print("DATA PREPARATION")
    print("="*50)
    
    # 1. Check the data types of all columns
    print("\n1. Current data types:")
    print(df.dtypes)
    print("\n")
    
    # Display info about the dataset including data types and null values
    print("Dataset info:")
    print(df.info())
    
    # Check for any obvious data type issues
    print(f"\n2. Data type analysis:")
    
    # Check each column and identify potential issues
    for col in df.columns:
        print(f"\nColumn '{col}':")
        print(f"  - Data type: {df[col].dtype}")
        print(f"  - Null values: {df[col].isnull().sum()}")
        print(f"  - Unique values: {df[col].nunique()}")
        
        # Show sample values for better understanding
        if df[col].dtype == 'object':
            print(f"  - Sample values: {df[col].dropna().head(3).tolist()}")
        else:
            print(f"  - Value range: {df[col].min()} to {df[col].max()}")
    
    # 3. Identify and fix common data type issues
    print(f"\n3. Data type corrections:")
    
    # Store original dtypes for comparison
    original_dtypes = df.dtypes.copy()
    
    # Common corrections based on typical dataset patterns
    corrections_made = []
    
    # Check for columns that should be numeric but are stored as objects
    for col in df.columns:
        if df[col].dtype == 'object':
            # Try to convert to numeric, errors='coerce' will convert non-numeric to NaN
            try:
                # First, let's see if it can be converted to numeric
                numeric_conversion = pd.to_numeric(df[col], errors='coerce')
                
                # If less than 50% of values become NaN, it's likely a numeric column
                nan_percentage = numeric_conversion.isnull().sum() / len(df)
                
                if nan_percentage < 0.5:  # If less than 50% are NaN after conversion
                    df[col] = numeric_conversion
                    corrections_made.append(f"'{col}': object → numeric")
                    
            except:
                pass
    
    # Check for columns that might be dates
    for col in df.columns:
        if df[col].dtype == 'object' and col.lower() in ['date', 'time', 'year', 'month']:
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                if not df[col].isnull().all():  # If conversion was successful for some values
                    corrections_made.append(f"'{col}': object → datetime")
            except:
                pass
    
    # Display corrections made
    if corrections_made:
        print("Corrections applied:")
        for correction in corrections_made:
            print(f"  - {correction}")
    else:
        print("No automatic corrections were needed or could be safely applied.")
    
    # 4. Display final data types after corrections
    print(f"\n4. Data types after corrections:")
    final_dtypes = df.dtypes
    
    print("\nComparison of data types:")
    print(f"{'Column':<20} {'Original':<15} {'Current':<15} {'Changed'}")
    print("-" * 65)
    
    for col in df.columns:
        original = str(original_dtypes[col])
        current = str(final_dtypes[col])
        changed = "Yes" if original != current else "No"
        print(f"{col:<20} {original:<15} {current:<15} {changed}")
    
    # 5. Final dataset summary
    print(f"\n5. Final dataset summary:")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage().sum()} bytes")
    print(f"Total null values: {df.isnull().sum().sum()}")
    
    # 6. MISSING VALUES ANALYSIS AND TREATMENT
    print("\n" + "="*50)
    print("MISSING VALUES ANALYSIS AND TREATMENT")
    print("="*50)
    
    # Identify columns with missing values
    print("\n6. Missing values analysis:")
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    
    # Create a summary of missing values
    missing_summary = pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percentage': missing_percentage.values
    })
    
    # Filter to show only columns with missing values
    columns_with_missing = missing_summary[missing_summary['Missing_Count'] > 0]
    
    if len(columns_with_missing) > 0:
        print("Columns with missing values:")
        print(columns_with_missing.to_string(index=False))
        
        # 7. Fill missing values with mean for numeric columns
        print(f"\n7. Filling missing values with mean:")
        
        filled_columns = []
        skipped_columns = []
        
        for col in df.columns:
            if df[col].isnull().sum() > 0:  # If column has missing values
                if df[col].dtype in ['int64', 'float64', 'int32', 'float32']:  # Numeric columns
                    # Calculate mean excluding NaN values
                    mean_value = df[col].mean()
                    
                    # Fill missing values with mean
                    df[col].fillna(mean_value, inplace=True)
                    
                    filled_columns.append({
                        'column': col,
                        'mean_value': round(mean_value, 4),
                        'filled_count': missing_values[col]
                    })
                    
                else:  # Non-numeric columns
                    skipped_columns.append({
                        'column': col,
                        'data_type': str(df[col].dtype),
                        'missing_count': missing_values[col]
                    })
        
        # Display results of filling operation
        if filled_columns:
            print("\nNumeric columns filled with mean values:")
            for item in filled_columns:
                print(f"  - '{item['column']}': {item['filled_count']} missing values filled with mean = {item['mean_value']}")
        
        if skipped_columns:
            print(f"\nNon-numeric columns skipped (cannot use mean):")
            for item in skipped_columns:
                print(f"  - '{item['column']}' ({item['data_type']}): {item['missing_count']} missing values")
                
        # 8. Verify missing values after treatment
        print(f"\n8. Missing values verification after treatment:")
        remaining_missing = df.isnull().sum()
        remaining_total = remaining_missing.sum()
        
        if remaining_total > 0:
            print("Remaining missing values:")
            remaining_summary = remaining_missing[remaining_missing > 0]
            for col, count in remaining_summary.items():
                percentage = (count / len(df)) * 100
                print(f"  - '{col}': {count} missing values ({percentage:.2f}%)")
        else:
            print("✓ All missing values in numeric columns have been successfully filled!")
            
        # Summary of missing value treatment
        print(f"\nMissing value treatment summary:")
        print(f"  - Total columns processed: {len(df.columns)}")
        print(f"  - Columns with missing values before: {len(columns_with_missing)}")
        print(f"  - Numeric columns filled with mean: {len(filled_columns)}")
        print(f"  - Non-numeric columns skipped: {len(skipped_columns)}")
        print(f"  - Remaining missing values: {remaining_total}")
        
    else:
        print("✓ No missing values found in the dataset!")
    
    # 9. Final dataset status
    print(f"\n" + "="*50)
    print("FINAL DATASET STATUS")
    print("="*50)
    
    print(f"Shape: {df.shape}")
    print(f"Total missing values: {df.isnull().sum().sum()}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum():,} bytes")
    
    print(f"\nData types summary:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  - {dtype}: {count} columns")
    
    print("\nData preparation and missing value treatment completed successfully!")
    print("Dataset is now ready for analysis!")
    
    # 10. TOP 10 COUNTRIES ANALYSIS AND VISUALIZATION
    print(f"\n" + "="*50)
    print("TOP 10 COUNTRIES ANALYSIS")
    print("="*50)
    
    # First, let's examine the column names to identify the relevant columns
    print(f"\nAvailable columns in the dataset:")
    for i, col in enumerate(df.columns, 1):
        print(f"{i:2d}. {col}")
    
    # Try to identify the relevant columns for analysis
    # Common column names for happiness/wellbeing datasets
    possible_rank_cols = [col for col in df.columns if 'rank' in col.lower() or 'happiness' in col.lower()]
    possible_country_cols = [col for col in df.columns if 'country' in col.lower() or 'nation' in col.lower()]
    possible_gdp_cols = [col for col in df.columns if 'gdp' in col.lower() or 'economy' in col.lower()]
    possible_health_cols = [col for col in df.columns if 'health' in col.lower() or 'life' in col.lower()]
    
    print(f"\nIdentified potential columns:")
    print(f"Country columns: {possible_country_cols}")
    print(f"Ranking columns: {possible_rank_cols}")
    print(f"GDP columns: {possible_gdp_cols}")
    print(f"Health columns: {possible_health_cols}")
    
        # Try to select the most appropriate columns (adjust these based on actual column names)
    try:
        # Attempt to identify columns automatically
        country_col = possible_country_cols[0] if possible_country_cols else df.columns[0]
        
        # For GDP per capita - look for GDP-related columns
        gdp_col = None
        for col in df.columns:
            if any(term in col.lower() for term in ['gdp', 'economy', 'economic']):
                gdp_col = col
                break
        
        # For Healthy Life Expectancy - look for health/life-related columns
        health_col = None
        for col in df.columns:
            if any(term in col.lower() for term in ['health', 'life', 'expectancy']):
                health_col = col
                break
        
        # For ranking - look for happiness score or similar
        score_col = None
        for col in df.columns:
            if any(term in col.lower() for term in ['score', 'happiness', 'rank']):
                if 'rank' not in col.lower():  # Prefer score over rank
                    score_col = col
                    break
        
        if not score_col:  # If no score found, look for rank
            for col in df.columns:
                if 'rank' in col.lower():
                    score_col = col
                    break
        
        print(f"\nSelected columns for analysis:")
        print(f"Country: {country_col}")
        print(f"GDP per capita: {gdp_col}")
        print(f"Healthy Life Expectancy: {health_col}")
        print(f"Ranking/Score: {score_col}")
        
        if gdp_col and health_col and score_col:
            # Sort by happiness score (or rank) to get top 10 countries
            if 'rank' in score_col.lower():
                # If it's a rank column, sort ascending (lower rank = better)
                top_10 = df.nsmallest(10, score_col)
            else:
                # If it's a score column, sort descending (higher score = better)
                top_10 = df.nlargest(10, score_col)
            
            print(f"\nTop 10 countries:")
            display_cols = [country_col, score_col, gdp_col, health_col]
            print(top_10[display_cols].to_string(index=False))
            
            # Create visualization using plotly
            import plotly.graph_objects as go
            from plotly.subplots import make_subplots
            
            # Create subplot with secondary y-axis
            fig1 = make_subplots(
                rows=1, cols=1,
                secondary_y=True,
                subplot_titles=('GDP per Capita and Healthy Life Expectancy - Top 10 Countries',)
            )
            
            # Add GDP per capita bar chart
            fig1.add_trace(
                go.Bar(
                    x=top_10[country_col],
                    y=top_10[gdp_col],
                    name='GDP per Capita',
                    marker_color='lightblue',
                    opacity=0.8
                ),
                secondary_y=False,
            )
            
            # Add Healthy Life Expectancy bar chart
            fig1.add_trace(
                go.Bar(
                    x=top_10[country_col],
                    y=top_10[health_col],
                    name='Healthy Life Expectancy',
                    marker_color='lightcoral',
                    opacity=0.8
                ),
                secondary_y=True,
            )
            
            # Update layout
            fig1.update_layout(
                title={
                    'text': 'GDP per Capita and Healthy Life Expectancy - Top 10 Countries',
                    'x': 0.5,
                    'xanchor': 'center',
                    'font': {'size': 16}
                },
                xaxis_title='Countries',
                barmode='group',
                height=600,
                width=1000,
                showlegend=True,
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1
                )
            )
            
            # Set y-axes titles
            fig1.update_yaxes(title_text="GDP per Capita", secondary_y=False)
            fig1.update_yaxes(title_text="Healthy Life Expectancy (Years)", secondary_y=True)
            
            # Rotate x-axis labels for better readability
            fig1.update_xaxes(tickangle=-45)
            
            # Show the plot
            fig1.show()
            
            print(f"\n✓ Bar chart 'fig1' created successfully!")
            print(f"✓ Chart shows GDP per Capita and Healthy Life Expectancy for top 10 countries")
            
            # Save the figure (optional)
            # fig1.write_html("top_10_countries_analysis.html")
            # print(f"✓ Chart saved as 'top_10_countries_analysis.html'")
            
        else:
            print(f"\n⚠️ Could not identify all required columns automatically.")
            print(f"Please check the column names and adjust the code accordingly.")
            print(f"Available columns: {list(df.columns)}")
            
    except Exception as viz_error:
        print(f"Error during visualization: {viz_error}")
        print(f"Please check if the required columns exist in the dataset.")
    
    # 11. DATA EXPLORATION - ADVANCED VISUALIZATIONS
    print(f"\n" + "="*50)
    print("DATA EXPLORATION - ADVANCED VISUALIZATIONS")
    print("="*50)
    
    try:
        # Import additional libraries for advanced visualizations
        import plotly.express as px
        import plotly.figure_factory as ff
        import numpy as np
        
        # 1. Create sub-dataset with specific attributes
        print(f"\n1. Creating sub-dataset with key attributes...")
        
        # Try to identify the correct column names
        attr_mapping = {}
        
        # Economy (GDP per Capita)
        for col in df.columns:
            if any(term in col.lower() for term in ['gdp', 'economy', 'economic']):
                attr_mapping['Economy'] = col
                break
        
        # Family
        for col in df.columns:
            if any(term in col.lower() for term in ['family', 'social']):
                attr_mapping['Family'] = col
                break
        
        # Health (Life Expectancy)
        for col in df.columns:
            if any(term in col.lower() for term in ['health', 'life']):
                attr_mapping['Health'] = col
                break
        
        # Freedom
        for col in df.columns:
            if 'freedom' in col.lower():
                attr_mapping['Freedom'] = col
                break
        
        # Trust (Government Corruption)
        for col in df.columns:
            if any(term in col.lower() for term in ['trust', 'corruption']):
                attr_mapping['Trust'] = col
                break
        
        # Generosity
        for col in df.columns:
            if 'generosity' in col.lower():
                attr_mapping['Generosity'] = col
                break
        
        # Happiness Score
        for col in df.columns:
            if any(term in col.lower() for term in ['happiness', 'score']) and 'rank' not in col.lower():
                attr_mapping['Happiness_Score'] = col
                break
        
        # Region
        for col in df.columns:
            if 'region' in col.lower():
                attr_mapping['Region'] = col
                break
        
        # Country
        for col in df.columns:
            if 'country' in col.lower():
                attr_mapping['Country'] = col
                break
        
        print(f"Identified attribute mappings:")
        for key, value in attr_mapping.items():
            print(f"  {key}: {value}")
        
        # Create sub-dataset
        required_attrs = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Happiness_Score']
        available_attrs = [attr for attr in required_attrs if attr in attr_mapping]
        
        if len(available_attrs) >= 4:  # Need at least 4 attributes for meaningful analysis
            sub_cols = [attr_mapping[attr] for attr in available_attrs]
            sub_df = df[sub_cols].copy()
            
            # Rename columns for clarity
            rename_dict = {attr_mapping[attr]: attr for attr in available_attrs}
            sub_df = sub_df.rename(columns=rename_dict)
            
            print(f"\nSub-dataset created with {len(sub_df.columns)} attributes:")
            print(sub_df.head())
            
            # 2. Create correlation heatmap (fig2)
            print(f"\n2. Creating correlation heatmap...")
            
            # Calculate correlation matrix
            corr_matrix = sub_df.corr()
            
            # Create heatmap using plotly
            fig2 = go.Figure(data=go.Heatmap(
                z=corr_matrix.values,
                x=corr_matrix.columns,
                y=corr_matrix.columns,
                colorscale='RdBu',
                zmid=0,
                text=np.round(corr_matrix.values, 2),
                texttemplate="%{text}",
                textfont={"size": 10},
                hoverongaps=False
            ))
            
            fig2.update_layout(
                title={
                    'text': 'Correlation Heatmap of Happiness Attributes',
                    'x': 0.5,
                    'xanchor': 'center',
                    'font': {'size': 16}
                },
                width=800,
                height=600,
                xaxis_title="Attributes",
                yaxis_title="Attributes"
            )
            
            fig2.show()
            print("✓ Correlation heatmap 'fig2' created successfully!")
            
            # 3. Create scatter plot between Happiness Score and GDP per Capita (fig3)
            print(f"\n3. Creating scatter plot...")
            
            if 'Happiness_Score' in sub_df.columns and 'Economy' in sub_df.columns and 'Region' in attr_mapping:
                # Prepare data for scatter plot
                scatter_data = df.copy()
                
                fig3 = px.scatter(
                    scatter_data,
                    x=attr_mapping['Economy'],
                    y=attr_mapping['Happiness_Score'],
                    color=attr_mapping['Region'] if 'Region' in attr_mapping else None,
                    hover_data=[attr_mapping['Country']] if 'Country' in attr_mapping else None,
                    title='Happiness Score vs GDP per Capita by Region',
                    labels={
                        attr_mapping['Economy']: 'GDP per Capita',
                        attr_mapping['Happiness_Score']: 'Happiness Score'
                    }
                )
                
                fig3.update_layout(
                    title={
                        'x': 0.5,
                        'xanchor': 'center',
                        'font': {'size': 16}
                    },
                    width=900,
                    height=600
                )
                
                fig3.show()
                print("✓ Scatter plot 'fig3' created successfully!")
            else:
                print("⚠️ Cannot create scatter plot - missing required columns")
            
            # 4. Create pie chart for Happiness Score by Region (fig4)
            print(f"\n4. Creating pie chart...")
            
            if 'Region' in attr_mapping and 'Happiness_Score' in attr_mapping:
                # Calculate average happiness score by region
                region_happiness = df.groupby(attr_mapping['Region'])[attr_mapping['Happiness_Score']].mean().reset_index()
                
                fig4 = px.pie(
                    region_happiness,
                    values=attr_mapping['Happiness_Score'],
                    names=attr_mapping['Region'],
                    title='Average Happiness Score by Region'
                )
                
                fig4.update_layout(
                    title={
                        'x': 0.5,
                        'xanchor': 'center',
                        'font': {'size': 16}
                    },
                    width=800,
                    height=600
                )
                
                fig4.show()
                print("✓ Pie chart 'fig4' created successfully!")
            else:
                print("⚠️ Cannot create pie chart - missing Region or Happiness Score columns")
            
            # 5. Create world map for GDP per capita with Health tooltip (fig5)
            print(f"\n5. Creating world map...")
            
            if 'Country' in attr_mapping and 'Economy' in attr_mapping:
                # Create world map
                fig5 = px.choropleth(
                    df,
                    locations=attr_mapping['Country'],
                    color=attr_mapping['Economy'],
                    hover_name=attr_mapping['Country'],
                    hover_data={
                        attr_mapping['Health']: True if 'Health' in attr_mapping else False,
                        attr_mapping['Economy']: ':.2f'
                    },
                    color_continuous_scale='Viridis',
                    locationmode='country names',
                    title='GDP per Capita by Country (with Health Life Expectancy tooltip)'
                )
                
                fig5.update_layout(
                    title={
                        'x': 0.5,
                        'xanchor': 'center',
                        'font': {'size': 16}
                    },
                    width=1000,
                    height=600,
                    geo=dict(showframe=False, showcoastlines=True)
                )
                
                fig5.show()
                print("✓ World map 'fig5' created successfully!")
            else:
                print("⚠️ Cannot create world map - missing Country or Economy columns")
            
            print(f"\n" + "="*50)
            print("DATA EXPLORATION COMPLETED")
            print("="*50)
            print("✓ All visualizations created successfully!")
            print("✓ fig1: Top 10 Countries GDP & Health Bar Chart")
            print("✓ fig2: Correlation Heatmap")
            print("✓ fig3: Happiness vs GDP Scatter Plot")
            print("✓ fig4: Happiness by Region Pie Chart")
            print("✓ fig5: GDP World Map with Health Tooltip")
            
            # 12. CREATE HTML DASHBOARD
            print(f"\n" + "="*50)
            print("CREATING HTML DASHBOARD")
            print("="*50)
            
            # Convert plots to HTML strings
            fig1_html = fig1.to_html(include_plotlyjs='cdn', div_id='fig1') if 'fig1' in locals() else ""
            fig2_html = fig2.to_html(include_plotlyjs='cdn', div_id='fig2') if 'fig2' in locals() else ""
            fig3_html = fig3.to_html(include_plotlyjs='cdn', div_id='fig3') if 'fig3' in locals() else ""
            fig4_html = fig4.to_html(include_plotlyjs='cdn', div_id='fig4') if 'fig4' in locals() else ""
            fig5_html = fig5.to_html(include_plotlyjs='cdn', div_id='fig5') if 'fig5' in locals() else ""
            
            # Create comprehensive HTML dashboard
            dashboard_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>World Happiness Analysis Dashboard 2016</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 0;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: #333;
            line-height: 1.6;
        }}
        
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            padding: 20px;
        }}
        
        .header {{
            background: rgba(255, 255, 255, 0.95);
            border-radius: 15px;
            padding: 40px;
            margin-bottom: 30px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
            text-align: center;
        }}
        
        .header h1 {{
            color: #2c3e50;
            font-size: 2.5em;
            margin-bottom: 10px;
            font-weight: 700;
        }}
        
        .header .subtitle {{
            color: #7f8c8d;
            font-size: 1.2em;
            margin-bottom: 20px;
        }}
        
        .executive-summary {{
            background: rgba(255, 255, 255, 0.95);
            border-radius: 15px;
            padding: 30px;
            margin-bottom: 30px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
        }}
        
        .section {{
            background: rgba(255, 255, 255, 0.95);
            border-radius: 15px;
            padding: 30px;
            margin-bottom: 30px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
        }}
        
        .section h2 {{
            color: #2c3e50;
            border-bottom: 3px solid #3498db;
            padding-bottom: 10px;
            margin-bottom: 20px;
            font-size: 1.8em;
        }}
        
        .section h3 {{
            color: #34495e;
            margin-top: 25px;
            margin-bottom: 15px;
            font-size: 1.3em;
        }}
        
        .visualization {{
            margin: 20px 0;
            border-radius: 10px;
            overflow: hidden;
            box-shadow: 0 5px 15px rgba(0,0,0,0.1);
        }}
        
        .insights {{
            background: #f8f9fa;
            border-left: 5px solid #3498db;
            padding: 20px;
            margin: 20px 0;
            border-radius: 0 10px 10px 0;
        }}
        
        .insights h4 {{
            color: #2980b9;
            margin-top: 0;
            font-size: 1.1em;
        }}
        
        .key-findings {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }}
        
        .finding-card {{
            background: #ecf0f1;
            padding: 20px;
            border-radius: 10px;
            border-left: 5px solid #e74c3c;
        }}
        
        .finding-card h5 {{
            color: #c0392b;
            margin-top: 0;
            font-size: 1em;
        }}
        
        .methodology {{
            background: #fdf6e3;
            border: 1px solid #f1c40f;
            border-radius: 10px;
            padding: 20px;
            margin: 20px 0;
        }}
        
        .footer {{
            text-align: center;
            padding: 30px;
            color: #fff;
            background: rgba(0,0,0,0.1);
            border-radius: 15px;
            margin-top: 30px;
        }}
        
        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 15px;
            margin: 20px 0;
        }}
        
        .stat-card {{
            background: #3498db;
            color: white;
            padding: 20px;
            border-radius: 10px;
            text-align: center;
        }}
        
        .stat-number {{
            font-size: 2em;
            font-weight: bold;
            display: block;
        }}
        
        .stat-label {{
            font-size: 0.9em;
            opacity: 0.9;
        }}
    </style>
</head>
<body>
    <div class="container">
        <!-- Header Section -->
        <div class="header">
            <h1>🌍 World Happiness Analysis Dashboard</h1>
            <div class="subtitle">Comprehensive Analysis of Global Well-being Indicators • 2016 Data</div>
            <div class="stats-grid">
                <div class="stat-card">
                    <span class="stat-number">{len(df)}</span>
                    <span class="stat-label">Countries Analyzed</span>
                </div>
                <div class="stat-card">
                    <span class="stat-number">{len(df.columns)}</span>
                    <span class="stat-label">Data Attributes</span>
                </div>
                <div class="stat-card">
                    <span class="stat-number">{len(df[attr_mapping['Region']].unique()) if 'Region' in attr_mapping else 'N/A'}</span>
                    <span class="stat-label">Global Regions</span>
                </div>
                <div class="stat-card">
                    <span class="stat-number">5</span>
                    <span class="stat-label">Visualizations</span>
                </div>
            </div>
        </div>
        
        <!-- Executive Summary -->
        <div class="executive-summary">
            <h2>📊 Executive Summary</h2>
            <p><strong>This comprehensive dashboard presents an in-depth analysis of global happiness and well-being indicators for 2016.</strong> Our analysis reveals critical insights into the factors that contribute to national happiness, including economic prosperity, social support, health outcomes, personal freedom, trust in government, and generosity.</p>
            
            <div class="key-findings">
                <div class="finding-card">
                    <h5>🏆 Top Performers</h5>
                    <p>Nordic countries and wealthy nations dominate the happiness rankings, with strong correlations between GDP per capita and overall well-being scores.</p>
                </div>
                <div class="finding-card">
                    <h5>🔗 Strong Correlations</h5>
                    <p>Economic factors, family support, and health life expectancy show the strongest positive correlations with happiness scores.</p>
                </div>
                <div class="finding-card">
                    <h5>🌏 Regional Patterns</h5>
                    <p>Clear regional clustering emerges, with Western Europe and North America leading in multiple well-being dimensions.</p>
                </div>
                <div class="finding-card">
                    <h5>🎯 Policy Implications</h5>
                    <p>The data suggests that balanced investment in economic development, healthcare, and social systems yields the highest happiness returns.</p>
                </div>
            </div>
        </div>
        
        <!-- Visualization 1: Top 10 Countries -->
        <div class="section">
            <h2>🏅 Top 10 Happiest Countries: Economic & Health Perspective</h2>
            <p>This comparative analysis highlights the relationship between economic prosperity (GDP per capita) and health outcomes (life expectancy) among the world's happiest nations. The visualization reveals that top-performing countries excel in both dimensions, suggesting a synergistic relationship between wealth and health.</p>
            
            <div class="visualization">
                {fig1_html}
            </div>
            
            <div class="insights">
                <h4>🔍 Key Insights:</h4>
                <ul>
                    <li><strong>Nordic Dominance:</strong> Scandinavian countries consistently rank high in both GDP per capita and life expectancy</li>
                    <li><strong>Wealth-Health Correlation:</strong> Countries with higher GDP generally show better health outcomes</li>
                    <li><strong>Balanced Development:</strong> Top performers maintain strong performance across both economic and health metrics</li>
                    <li><strong>Policy Success:</strong> These nations demonstrate effective healthcare and economic policies</li>
                </ul>
            </div>
        </div>
        
        <!-- Visualization 2: Correlation Heatmap -->
        <div class="section">
            <h2>🔗 Happiness Factors Correlation Analysis</h2>
            <p>The correlation heatmap reveals the intricate relationships between different happiness indicators. Understanding these correlations is crucial for policymakers to identify which factors most strongly influence national well-being and where interventions might be most effective.</p>
            
            <div class="visualization">
                {fig2_html}
            </div>
            
            <div class="insights">
                <h4>🔍 Key Insights:</h4>
                <ul>
                    <li><strong>Strongest Correlations:</strong> GDP per capita and life expectancy show the highest correlation with happiness scores</li>
                    <li><strong>Social Factors:</strong> Family and social support demonstrate significant positive correlations</li>
                    <li><strong>Freedom Impact:</strong> Personal freedom correlates moderately but consistently with overall happiness</li>
                    <li><strong>Complex Relationships:</strong> Some factors show unexpected correlation patterns, suggesting nuanced policy implications</li>
                </ul>
            </div>
        </div>
        
        <!-- Visualization 3: Scatter Plot -->
        <div class="section">
            <h2>💰 Happiness vs. Economic Prosperity by Region</h2>
            <p>This scatter plot analysis examines the relationship between economic prosperity (GDP per capita) and happiness scores across different global regions. The color-coding by region reveals distinct patterns and outliers that inform our understanding of cultural and regional factors in well-being.</p>
            
            <div class="visualization">
                {fig3_html}
            </div>
            
            <div class="insights">
                <h4>🔍 Key Insights:</h4>
                <ul>
                    <li><strong>Regional Clustering:</strong> Clear clustering patterns emerge by geographic region</li>
                    <li><strong>Wealth Threshold:</strong> There appears to be a happiness threshold related to GDP levels</li>
                    <li><strong>Cultural Factors:</strong> Some regions achieve higher happiness with lower GDP, suggesting cultural influences</li>
                    <li><strong>Outlier Analysis:</strong> Notable outliers provide insights into unique national circumstances</li>
                </ul>
            </div>
        </div>
        
        <!-- Visualization 4: Regional Pie Chart -->
        <div class="section">
            <h2>🌍 Regional Happiness Distribution</h2>
            <p>The regional happiness distribution provides a macro-level view of how well-being varies across different parts of the world. This analysis helps identify which regions are thriving and which may need targeted international development support.</p>
            
            <div class="visualization">
                {fig4_html}
            </div>
            
            <div class="insights">
                <h4>🔍 Key Insights:</h4>
                <ul>
                    <li><strong>Western Dominance:</strong> Western Europe leads in average regional happiness scores</li>
                    <li><strong>Development Gaps:</strong> Significant disparities exist between developed and developing regions</li>
                    <li><strong>Regional Potential:</strong> Some regions show untapped potential for happiness improvement</li>
                    <li><strong>Global Inequality:</strong> The distribution highlights global well-being inequalities</li>
                </ul>
            </div>
        </div>
        
        <!-- Visualization 5: World Map -->
        <div class="section">
            <h2>🗺️ Global GDP Distribution with Health Context</h2>
            <p>The world map visualization provides a geographical perspective on economic prosperity, with health life expectancy data available through interactive tooltips. This global view helps identify regional economic patterns and their relationship to health outcomes.</p>
            
            <div class="visualization">
                {fig5_html}
            </div>
            
            <div class="insights">
                <h4>🔍 Key Insights:</h4>
                <ul>
                    <li><strong>Geographic Patterns:</strong> Clear geographic clustering of economic prosperity</li>
                    <li><strong>Resource Distribution:</strong> Natural resource endowments correlate with some high-GDP areas</li>
                    <li><strong>Development Corridors:</strong> Economic development often follows geographic and cultural corridors</li>
                    <li><strong>Health-Wealth Nexus:</strong> Interactive tooltips reveal the strong relationship between GDP and health outcomes</li>
                </ul>
            </div>
        </div>
        
        <!-- Methodology Section -->
        <div class="methodology">
            <h2>🔬 Methodology & Data Processing</h2>
            <h3>Data Preparation:</h3>
            <ul>
                <li><strong>Source:</strong> World Happiness Report 2016 dataset from IBM Skills Network</li>
                <li><strong>Preprocessing:</strong> Automated data type detection and correction</li>
                <li><strong>Missing Values:</strong> Numeric columns filled with mean values where appropriate</li>
                <li><strong>Quality Assurance:</strong> Comprehensive data validation and verification</li>
            </ul>
            
            <h3>Analytical Approach:</h3>
            <ul>
                <li><strong>Correlation Analysis:</strong> Pearson correlation coefficients calculated for all numeric variables</li>
                <li><strong>Regional Grouping:</strong> Countries analyzed by geographic and cultural regions</li>
                <li><strong>Top Performer Analysis:</strong> Rankings based on composite happiness scores</li>
                <li><strong>Interactive Visualization:</strong> Plotly-based charts for enhanced user engagement</li>
            </ul>
        </div>
        
        <!-- Conclusions & Recommendations -->
        <div class="section">
            <h2>🎯 Conclusions & Policy Recommendations</h2>
            
            <h3>Major Findings:</h3>
            <div class="key-findings">
                <div class="finding-card">
                    <h5>Economic Foundation</h5>
                    <p>GDP per capita remains the strongest predictor of national happiness, but diminishing returns suggest a threshold effect.</p>
                </div>
                <div class="finding-card">
                    <h5>Health Investment</h5>
                    <p>Life expectancy and health outcomes show consistent positive correlations with well-being across all regions.</p>
                </div>
                <div class="finding-card">
                    <h5>Social Cohesion</h5>
                    <p>Family support and social connections prove critical for happiness beyond economic factors.</p>
                </div>
                <div class="finding-card">
                    <h5>Governance Quality</h5>
                    <p>Trust in government and low corruption correlate significantly with higher happiness scores.</p>
                </div>
            </div>
            
            <h3>Policy Recommendations:</h3>
            <ul>
                <li><strong>Holistic Development:</strong> Balance economic growth with social and health investments</li>
                <li><strong>Healthcare Priority:</strong> Prioritize healthcare systems as fundamental to national well-being</li>
                <li><strong>Social Infrastructure:</strong> Invest in community support systems and social cohesion programs</li>
                <li><strong>Governance Reform:</strong> Strengthen institutions and reduce corruption for long-term happiness gains</li>
                <li><strong>Regional Cooperation:</strong> Facilitate knowledge sharing between high-performing and developing regions</li>
            </ul>
        </div>
        
        <!-- Footer -->
        <div class="footer">
            <p><strong>World Happiness Analysis Dashboard</strong></p>
            <p>Generated using Python, Pandas, and Plotly • Data from World Happiness Report 2016</p>
            <p>📊 Comprehensive analysis of {len(df)} countries across {len(df.columns)} indicators</p>
        </div>
    </div>
</body>
</html>
"""
            
            # Save the dashboard
            with open('world_happiness_dashboard.html', 'w', encoding='utf-8') as f:
                f.write(dashboard_html)
            
            print("✓ HTML Dashboard created successfully!")
            print("✓ File saved as: 'world_happiness_dashboard.html'")
            print("✓ Dashboard includes:")
            print("  - Executive Summary with key findings")
            print("  - All 5 interactive visualizations")
            print("  - Detailed insights for each chart")
            print("  - Methodology and recommendations")
            print("  - Professional styling and responsive design")
            print("\n🎉 Complete data analysis and dashboard generation finished!")
            
        else:
            print(f"⚠️ Insufficient attributes found for analysis.")
            print(f"Found: {available_attrs}")
            print(f"Required: {required_attrs}")
            
    except Exception as explore_error:
        print(f"Error during data exploration: {explore_error}")
        print(f"Please ensure all required libraries are installed: plotly, numpy")
    
except Exception as e:
    print(f"Error during data preparation: {e}")
First 5 rows of the dataset:
       Country          Region  Happiness Rank  Happiness Score  \
0      Denmark  Western Europe               1            7.526   
1  Switzerland  Western Europe               2            7.509   
2      Iceland  Western Europe               3            7.501   
3       Norway  Western Europe               4            7.498   
4      Finland  Western Europe               5            7.413   

   Lower Confidence Interval Upper Confidence Interval  \
0                      7.460                     7.592   
1                      7.428                      7.59   
2                      7.333                     7.669   
3                      7.421                     7.575   
4                      7.351                     7.475   

  Economy (GDP per Capita)   Family Health (Life Expectancy)  Freedom  \
0                  1.44178  1.16374                  0.79504  0.57941   
1                  1.52733  1.14524                  0.86303  0.58557   
2                  1.42666  1.18326                  0.86733  0.56624   
3                  1.57744  1.12690                  0.79579  0.59609   
4                  1.40598  1.13464                  0.81091  0.57104   

   Trust (Government Corruption)  Generosity  Dystopia Residual  
0                        0.44453     0.36171            2.73939  
1                        0.41203     0.28083            2.69463  
2                        0.14975     0.47678            2.83137  
3                        0.35776     0.37895            2.66465  
4                        0.41004     0.25492            2.82596  

Dataset shape: (157, 13)
Columns: ['Country', 'Region', 'Happiness Rank', 'Happiness Score', 'Lower Confidence Interval', 'Upper Confidence Interval', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']

==================================================
DATA PREPARATION
==================================================

1. Current data types:
Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval         object
Economy (GDP per Capita)          object
Family                           float64
Health (Life Expectancy)          object
Freedom                           object
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      153 non-null    float64
 5   Upper Confidence Interval      155 non-null    object 
 6   Economy (GDP per Capita)       156 non-null    object 
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       155 non-null    object 
 9   Freedom                        157 non-null    object 
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    float64
dtypes: float64(6), int64(1), object(6)
memory usage: 16.1+ KB
None

2. Data type analysis:

Column 'Country':
  - Data type: object
  - Null values: 0
  - Unique values: 157
  - Sample values: ['Denmark', 'Switzerland', 'Iceland']

Column 'Region':
  - Data type: object
  - Null values: 0
  - Unique values: 10
  - Sample values: ['Western Europe', 'Western Europe', 'Western Europe']

Column 'Happiness Rank':
  - Data type: int64
  - Null values: 0
  - Unique values: 154
  - Value range: 1 to 157

Column 'Happiness Score':
  - Data type: float64
  - Null values: 0
  - Unique values: 154
  - Value range: 2.905 to 7.526

Column 'Lower Confidence Interval':
  - Data type: float64
  - Null values: 4
  - Unique values: 150
  - Value range: 2.732 to 7.46

Column 'Upper Confidence Interval':
  - Data type: object
  - Null values: 2
  - Unique values: 152
  - Sample values: ['7.592', '7.59', '7.669']

Column 'Economy (GDP per Capita)':
  - Data type: object
  - Null values: 1
  - Unique values: 156
  - Sample values: ['1.44178', '1.52733', '1.42666']

Column 'Family':
  - Data type: float64
  - Null values: 0
  - Unique values: 157
  - Value range: 0.0 to 1.18326

Column 'Health (Life Expectancy)':
  - Data type: object
  - Null values: 2
  - Unique values: 154
  - Sample values: ['0.79504', '0.86303', '0.86733']

Column 'Freedom':
  - Data type: object
  - Null values: 0
  - Unique values: 157
  - Sample values: ['0.57941', '0.58557', '0.56624']

Column 'Trust (Government Corruption)':
  - Data type: float64
  - Null values: 0
  - Unique values: 156
  - Value range: 0.0 to 0.50521

Column 'Generosity':
  - Data type: float64
  - Null values: 0
  - Unique values: 157
  - Value range: 0.0 to 0.81971

Column 'Dystopia Residual':
  - Data type: float64
  - Null values: 0
  - Unique values: 157
  - Value range: 0.81789 to 3.83772

3. Data type corrections:
Corrections applied:
  - 'Upper Confidence Interval': object → numeric
  - 'Economy (GDP per Capita)': object → numeric
  - 'Health (Life Expectancy)': object → numeric
  - 'Freedom': object → numeric

4. Data types after corrections:

Comparison of data types:
Column               Original        Current         Changed
-----------------------------------------------------------------
Country              object          object          No
Region               object          object          No
Happiness Rank       int64           int64           No
Happiness Score      float64         float64         No
Lower Confidence Interval float64         float64         No
Upper Confidence Interval object          float64         Yes
Economy (GDP per Capita) object          float64         Yes
Family               float64         float64         No
Health (Life Expectancy) object          float64         Yes
Freedom              object          float64         Yes
Trust (Government Corruption) float64         float64         No
Generosity           float64         float64         No
Dystopia Residual    float64         float64         No

5. Final dataset summary:
Shape: (157, 13)
Memory usage: 16456 bytes
Total null values: 13

==================================================
MISSING VALUES ANALYSIS AND TREATMENT
==================================================

6. Missing values analysis:
Columns with missing values:
                   Column  Missing_Count  Missing_Percentage
Lower Confidence Interval              4            2.547771
Upper Confidence Interval              3            1.910828
 Economy (GDP per Capita)              2            1.273885
 Health (Life Expectancy)              3            1.910828
                  Freedom              1            0.636943

7. Filling missing values with mean:

Numeric columns filled with mean values:
  - 'Lower Confidence Interval': 4 missing values filled with mean = 5.2686
  - 'Upper Confidence Interval': 3 missing values filled with mean = 5.4728
  - 'Economy (GDP per Capita)': 2 missing values filled with mean = 0.9518
  - 'Health (Life Expectancy)': 3 missing values filled with mean = 0.5533
  - 'Freedom': 1 missing values filled with mean = 0.371

8. Missing values verification after treatment:
✓ All missing values in numeric columns have been successfully filled!

Missing value treatment summary:
  - Total columns processed: 13
  - Columns with missing values before: 5
  - Numeric columns filled with mean: 5
  - Non-numeric columns skipped: 0
  - Remaining missing values: 0

==================================================
FINAL DATASET STATUS
==================================================
Shape: (157, 13)
Total missing values: 0
Memory usage: 36,477 bytes

Data types summary:
  - float64: 10 columns
  - object: 2 columns
  - int64: 1 columns

Data preparation and missing value treatment completed successfully!
Dataset is now ready for analysis!

==================================================
TOP 10 COUNTRIES ANALYSIS
==================================================

Available columns in the dataset:
 1. Country
 2. Region
 3. Happiness Rank
 4. Happiness Score
 5. Lower Confidence Interval
 6. Upper Confidence Interval
 7. Economy (GDP per Capita)
 8. Family
 9. Health (Life Expectancy)
10. Freedom
11. Trust (Government Corruption)
12. Generosity
13. Dystopia Residual

Identified potential columns:
Country columns: ['Country']
Ranking columns: ['Happiness Rank', 'Happiness Score']
GDP columns: ['Economy (GDP per Capita)']
Health columns: ['Health (Life Expectancy)']

Selected columns for analysis:
Country: Country
GDP per capita: Economy (GDP per Capita)
Healthy Life Expectancy: Health (Life Expectancy)
Ranking/Score: Happiness Score

Top 10 countries:
    Country  Happiness Score  Economy (GDP per Capita)  Health (Life Expectancy)
    Denmark            7.526                   1.44178                   0.79504
Switzerland            7.509                   1.52733                   0.86303
    Iceland            7.501                   1.42666                   0.86733
     Norway            7.498                   1.57744                   0.79579
    Finland            7.413                   1.40598                   0.81091
     Canada            7.404                   1.44015                   0.82760
Netherlands            7.339                   1.46468                   0.81231
New Zealand            7.334                   1.36066                   0.83096
  Australia            7.313                   1.44443                   0.85120
     Sweden            7.291                   1.45181                   0.83121
Error during visualization: make_subplots() got unexpected keyword argument(s): ['secondary_y']
Please check if the required columns exist in the dataset.

==================================================
DATA EXPLORATION - ADVANCED VISUALIZATIONS
==================================================

1. Creating sub-dataset with key attributes...
Identified attribute mappings:
  Economy: Economy (GDP per Capita)
  Family: Family
  Health: Health (Life Expectancy)
  Freedom: Freedom
  Trust: Trust (Government Corruption)
  Generosity: Generosity
  Happiness_Score: Happiness Score
  Region: Region
  Country: Country

Sub-dataset created with 7 attributes:
   Economy   Family   Health  Freedom    Trust  Generosity  Happiness_Score
0  1.44178  1.16374  0.79504  0.57941  0.44453     0.36171            7.526
1  1.52733  1.14524  0.86303  0.58557  0.41203     0.28083            7.509
2  1.42666  1.18326  0.86733  0.56624  0.14975     0.47678            7.501
3  1.57744  1.12690  0.79579  0.59609  0.35776     0.37895            7.498
4  1.40598  1.13464  0.81091  0.57104  0.41004     0.25492            7.413

2. Creating correlation heatmap...
✓ Correlation heatmap 'fig2' created successfully!

3. Creating scatter plot...
✓ Scatter plot 'fig3' created successfully!

4. Creating pie chart...
✓ Pie chart 'fig4' created successfully!

5. Creating world map...
✓ World map 'fig5' created successfully!

==================================================
DATA EXPLORATION COMPLETED
==================================================
✓ All visualizations created successfully!
✓ fig1: Top 10 Countries GDP & Health Bar Chart
✓ fig2: Correlation Heatmap
✓ fig3: Happiness vs GDP Scatter Plot
✓ fig4: Happiness by Region Pie Chart
✓ fig5: GDP World Map with Health Tooltip

==================================================
CREATING HTML DASHBOARD
==================================================
✓ HTML Dashboard created successfully!
✓ File saved as: 'world_happiness_dashboard.html'
✓ Dashboard includes:
  - Executive Summary with key findings
  - All 5 interactive visualizations
  - Detailed insights for each chart
  - Methodology and recommendations
  - Professional styling and responsive design

🎉 Complete data analysis and dashboard generation finished!
In [ ]: